{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "!pip install transformers==4.32.0 \n",
    "!pip install accelerate==0.25.0 \n",
    "!pip install tiktoken \n",
    "!pip install einops \n",
    "!pip install transformers_stream_generator==0.0.4 \n",
    "!pip install scipy \n",
    "!pip install modelscope \n",
    "!pip install optimum==1.12.0 \n",
    "!pip install peft==0.6.2 \n",
    "!pip install deepspeed==0.12.4 \n",
    "!pip install cohere \n",
    "!pip install torch==2.1.1\n",
    "!pip install pandas==2.1.3\n",
    "!pip install mpi4py\n",
    "!pip install flash-attn --no-build-isolation"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e874fe1f966d1cad"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from transformers import AutoTokenizer\n",
    "from peft import AutoPeftModelForCausalLM\n",
    "import shutil\n",
    "from modelscope import snapshot_download\n",
    "import subprocess\n",
    "import json\n",
    "import pickle"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "60a90fec3ebd7696"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def get_model(model_id):\n",
    "    \"\"\"从modelscope拉取模型，并将模型移动到model文件夹内\"\"\"\n",
    "\n",
    "    model_dir = snapshot_download(model_id)\n",
    "    shutil.move(model_dir, 'model')\n",
    "    model_path = os.path.abspath(os.path.join('model', os.path.basename(model_dir)))\n",
    "    return model_path"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "6a6a2e0ed3010245"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "def get_test(test_data, model, tokenizer):\n",
    "    # 使用测试集数据进行推理\n",
    "    def get_response(row):\n",
    "        response, _ = model.chat(tokenizer, row['question'], history=None, system='评估新闻')\n",
    "        return response\n",
    "    test_data['result_new'] = test_data.apply(get_response, axis=1)\n",
    "    # 重置索引并将字符串的json结果分割成为reason, sentiment, impact三列\n",
    "    test_data['result_new'] = test_data['result_new'].map(lambda x: json.loads(x.replace(\"'\", '\"')))\n",
    "    test_data = test_data.reset_index(drop=True)\n",
    "    test_all = pd.concat([test_data, pd.json_normalize(test_data['result_new'])], axis=1)\n",
    "    # 将新的列后面加_new后缀\n",
    "    rename_dict = {col:col+'_new' for col in ['reason', 'sentiment', 'impact']}\n",
    "    test_all = test_all.rename(columns=rename_dict)\n",
    "    # 进行数据格式转换\n",
    "    test_all[['sentiment_new', 'impact_new']] = test_all[['sentiment_new', 'impact_new']].apply(pd.to_numeric)\n",
    "    # 计算情绪分数的误差\n",
    "    def get_sentiment_error(row):\n",
    "        old = row['sentiment_old']\n",
    "        new = row['sentiment_new']\n",
    "        if old * new >= 0:\n",
    "            error = new - old\n",
    "        else:\n",
    "            error = 1 if abs(new - old) <= 1 else abs(new - old)\n",
    "        return error ** 2\n",
    "    # 计算影响力分数的误差\n",
    "    def get_impact_error(row):\n",
    "        old = row['impact_old']\n",
    "        new = row['impact_new']\n",
    "        return abs(new - old) ** 2\n",
    "    test_all['sentiment_se'] = test_all.apply(get_sentiment_error, axis=1)\n",
    "    test_all['sentiment_se_0.1'] = test_all['sentiment_se'] > 0.1\n",
    "    test_all['impact_se'] = test_all.apply(get_impact_error, axis=1)\n",
    "    test_all['impact_se_0.1'] = test_all['impact_se'] > 0.1\n",
    "    return test_all"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "8c538f10ebb8281e"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# 训练文件路径的列表\n",
    "train_path = 'data/train'\n",
    "train_list = [os.path.join(train_path, file) for file in os.listdir(train_path)]\n",
    "# 测试文件的路径\n",
    "test_path = 'data/test'\n",
    "test_list = [pd.read_pickle(os.path.join(test_path, file)) for file in os.listdir(test_path)]"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ed7886ef8e94b3f5"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "model_path = get_model('qwen/Qwen-1_8B-Chat')\n",
    "# model_path = ''\n",
    "adapter_path = f\"adapter/single\"\n",
    "model_path"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "cc1563da4a7d5aec"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "subprocess.run(\"bash finetune_lora_single_gpu.sh -m  -d data/train/train_00.json -o adapter/single\")"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "4a59b2bc52ee08"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "model = AutoPeftModelForCausalLM.from_pretrained(\n",
    "adapter_path,\n",
    "device_map=\"auto\",\n",
    "trust_remote_code=True).eval()\n",
    "tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)\n",
    "test_list = [get_test(data, model, tokenizer) for data in test_list]\n",
    "with open('outcome.pkl', 'wb') as f:\n",
    "    pickle.dump(test_list, f)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "97d00f376a21ab16"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
